- ML Model Training
- Training machine learning models involves selecting appropriate algorithms, preparing data, and optimizing model parameters to achieve strong predictive performance.
- Training Phases
  - Data Preparation: Cleaning, encoding, normalization
  - Feature Engineering: Creating meaningful features
  - Model Selection: Choosing appropriate algorithms
  - Hyperparameter Tuning: Optimizing model settings
  - Validation: Cross-validation and evaluation metrics
  - Deployment: Preparing models for production
- Common Algorithms
  - Regression: Linear, Ridge, Lasso, Random Forest
  - Classification: Logistic, SVM, Random Forest, Gradient Boosting
  - Clustering: K-Means, DBSCAN, Hierarchical
  - Neural Networks: MLPs, CNNs, RNNs, Transformers
- Python Implementation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score
)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import tensorflow as tf
from tensorflow import keras

# 1. Generate synthetic dataset
np.random.seed(42)  # fixed seed so the synthetic data is reproducible
n_samples = 1000
n_features = 20
X = np.random.randn(n_samples, n_features)
# Binary label: a noisy linear combination of the first three features,
# thresholded at zero.
# NOTE(review): the comparison operator was lost in the source text and is
# restored here as `>` -- confirm against the original.
y = (X[:, 0] + X[:, 1] - X[:, 2] + np.random.randn(n_samples) * 0.5 > 0).astype(int)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize features
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Dataset shapes:")
print(f"Training: {X_train_scaled.shape}, Testing: {X_test_scaled.shape}")
print(f"Class distribution: {np.bincount(y_train)}")
# 2. Scikit-learn models
print("\n=== Scikit-learn Models ===")

# Candidate classifiers, keyed by the display name used in the report.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Maps model name -> {metric name -> score on the test split}.
sklearn_results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Second probability column is used as the score for ROC-AUC.
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    sklearn_results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
    }

    print(f"\n{name}:")
    for metric, value in sklearn_results[name].items():
        print(f"  {metric}: {value:.4f}")
# 3. PyTorch neural network
print("\n=== PyTorch Model ===")


class NeuralNetPyTorch(nn.Module):
    """Three-layer fully connected binary classifier.

    Layer sizes are input_size -> 64 -> 32 -> 1, with ReLU and dropout
    (p=0.3) after each hidden layer and a sigmoid on the output.
    """

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: Number of input features per sample.
        """
        # Must be the dunder constructor and must call the parent
        # constructor first: nn.Module sets up parameter registration there.
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return the sigmoid output of the network for input batch ``x``."""
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        # Sigmoid is applied here because the loss below is BCELoss, which
        # expects probabilities rather than raw logits.
        x = torch.sigmoid(self.fc3(x))
        return x


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pytorch_model = NeuralNetPyTorch(n_features).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.001)
# Create data loaders
train_dataset = TensorDataset(
    torch.FloatTensor(X_train_scaled),
    # unsqueeze(1) adds a trailing dimension to the target tensor so it has
    # one column, like the single-unit output of the network's last layer.
    torch.FloatTensor(y_train).unsqueeze(1),
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Train PyTorch model
epochs = 50
pytorch_losses = []  # mean batch loss per epoch, plotted later

for epoch in range(epochs):
    total_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()  # clear gradients left from the previous step
        outputs = pytorch_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    pytorch_losses.append(total_loss / len(train_loader))

    # Report progress every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {pytorch_losses[-1]:.4f}")
# Evaluate PyTorch
pytorch_model.eval()  # switch dropout to inference behaviour
with torch.no_grad():  # no gradients are needed for prediction
    y_pred_pytorch = pytorch_model(torch.FloatTensor(X_test_scaled).to(device))
    # Turn sigmoid outputs into 0/1 labels at the 0.5 threshold and flatten
    # to a 1-D array for accuracy_score.
    # NOTE(review): the comparison operator was lost in the source text and
    # is restored here as `>` -- confirm against the original.
    y_pred_pytorch = (y_pred_pytorch.cpu().numpy() > 0.5).astype(int).flatten()

print(f"\nPyTorch Accuracy: {accuracy_score(y_test, y_pred_pytorch):.4f}")
# 4. TensorFlow/Keras model
print("\n=== TensorFlow/Keras Model ===")

# Same layer layout as the PyTorch network: 64 -> 32 -> 1 with dropout 0.3
# after each hidden layer and a sigmoid output.
tf_model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(n_features,)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid'),
])

tf_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# validation_split holds out 20% of the training data; the resulting
# per-epoch metrics are kept in `history` for the plots further down.
history = tf_model.fit(
    X_train_scaled, y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
    verbose=0,
)

# Turn sigmoid outputs into 0/1 labels at the 0.5 threshold.
# NOTE(review): the comparison operator was lost in the source text and is
# restored here as `>` -- confirm against the original.
y_pred_tf = (tf_model.predict(X_test_scaled) > 0.5).astype(int).flatten()
print(f"TensorFlow Accuracy: {accuracy_score(y_test, y_pred_tf):.4f}")
# 5. Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Model comparison
# Bar chart of test accuracy: the scikit-learn models first, then the two
# neural networks, in the same order as the labels.
models_names = list(sklearn_results.keys()) + ['PyTorch', 'TensorFlow']
accuracies = [sklearn_results[m]['accuracy'] for m in sklearn_results] + [
    accuracy_score(y_test, y_pred_pytorch),
    accuracy_score(y_test, y_pred_tf),
]
axes[0, 0].bar(range(len(models_names)), accuracies, color='steelblue')
axes[0, 0].set_xticks(range(len(models_names)))
axes[0, 0].set_xticklabels(models_names, rotation=45)
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].set_title('Model Comparison')
axes[0, 0].set_ylim([0, 1])

# Training loss curves
axes[0, 1].plot(pytorch_losses, label='PyTorch', linewidth=2)
axes[0, 1].plot(history.history['loss'], label='TensorFlow', linewidth=2)
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Loss')
axes[0, 1].set_title('Training Loss Comparison')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Scikit-learn metrics
# Only the Random Forest entry is plotted; 'roc_auc' is left out of this panel.
metrics = ['accuracy', 'precision', 'recall', 'f1']
rf_metrics = [sklearn_results['Random Forest'][m] for m in metrics]
axes[1, 0].bar(metrics, rf_metrics, color='coral')
axes[1, 0].set_ylabel('Score')
axes[1, 0].set_title('Random Forest Metrics')
axes[1, 0].set_ylim([0, 1])
# Validation accuracy over epochs
# Bottom-right panel: Keras training vs. validation accuracy per epoch.
axes[1, 1].plot(history.history['accuracy'], label='Training', linewidth=2)
axes[1, 1].plot(history.history['val_accuracy'], label='Validation', linewidth=2)
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Accuracy')
axes[1, 1].set_title('TensorFlow Training History')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

# Write the whole figure to disk in the current working directory.
plt.tight_layout()
plt.savefig('model_training_comparison.png', dpi=100, bbox_inches='tight')
print("\nVisualization saved as 'model_training_comparison.png'")

print("\nModel training completed!")
- Training Best Practices
  - Data Split: 70/15/15 for train/validation/test
  - Scaling: Normalize features before training
  - Cross-validation: Use K-fold for robust evaluation
  - Early Stopping: Prevent overfitting
  - Class Balancing: Handle imbalanced datasets
- Key Metrics
  - Accuracy: Overall correctness
  - Precision: Positive prediction accuracy
  - Recall: True positive detection rate
  - F1 Score: Harmonic mean of precision/recall
  - ROC-AUC: Threshold-independent metric
- Deliverables
  - Trained model checkpoint
  - Performance metrics on test set
  - Feature importance analysis
  - Learning curves
  - Hyperparameter configuration
  - Model evaluation report